
# Preparamos el Setup. Llamando a todas las librerias que vamos a usar.
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
# Lectura con Pandas del CSV y cración del Data Frame.
stockx = pd.read_csv('stockx_eda_cc_19.csv', sep=';')
df = pd.DataFrame(stockx)
df.head(10)
| Order Date | Brand | Sneaker Name | Sale Price | Retail Price | Release Date | Shoe Size | Buyer Region | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-Low-V2-Beluga | $1.097 | $220 | 24/9/16 | 11 | California |
| 1 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Core-Black-Copper | $685 | $220 | 23/11/16 | 11 | California |
| 2 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Core-Black-Green | $690 | $220 | 23/11/16 | 11 | California |
| 3 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Core-Black-Red | $1.075 | $220 | 23/11/16 | 11,5 | Kentucky |
| 4 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017 | $828 | $220 | 11/2/17 | 11 | Rhode Island |
| 5 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017 | $798 | $220 | 11/2/17 | 8,5 | Michigan |
| 6 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Core-Black-White | $784 | $220 | 17/12/16 | 11 | California |
| 7 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Cream-White | $460 | $220 | 29/4/17 | 10 | New York |
| 8 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Cream-White | $465 | $220 | 29/4/17 | 11 | Kansas |
| 9 | 1/9/17 | Yeezy | Adidas-Yeezy-Boost-350-V2-Cream-White | $465 | $220 | 29/4/17 | 11 | Florida |
#Procedemos a la limpieza de los datos y a sus conversiones.
df['Sneaker Name'] = df['Sneaker Name'].apply(lambda x: x.replace('-', ' '))
df['Retail Price'] = df['Retail Price'].apply(lambda x: x.replace('$', ''))
df['Sale Price'] = df['Sale Price'].apply(lambda x: x.replace('$', ''))
df['Shoe Size']=df['Shoe Size'].apply(lambda x : x.replace(',','.'))
df['Sale Price'] = df['Sale Price'].apply(lambda x: x.replace('.', ''))
df['Order Date'] = pd.to_datetime(df['Order Date']) #format='%d/%m/%Y')
df['Release Date'] = pd.to_datetime(df['Release Date']) #format='%d/%m/%Y')
# Convertimos los precios en enteros y la talla de pie en número con decimales.
df['Sale Price']=df['Sale Price'].astype(int)
df['Retail Price']=df['Retail Price'].astype(int)
df['Shoe Size']=df['Shoe Size'].astype(float)
# Ahora si mostramos todas las columnas con los valores que deseamos trabajar comodamente.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99956 entries, 0 to 99955 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order Date 99956 non-null datetime64[ns] 1 Brand 99956 non-null object 2 Sneaker Name 99956 non-null object 3 Sale Price 99956 non-null int64 4 Retail Price 99956 non-null int64 5 Release Date 99956 non-null datetime64[ns] 6 Shoe Size 99956 non-null float64 7 Buyer Region 99956 non-null object dtypes: datetime64[ns](2), float64(1), int64(2), object(3) memory usage: 6.1+ MB
# No tenemos Datos nulos.
df.isnull().sum()
Order Date 0 Brand 0 Sneaker Name 0 Sale Price 0 Retail Price 0 Release Date 0 Shoe Size 0 Buyer Region 0 dtype: int64
# Creamos la función para calcular el beneficio neto.
def Beneficio_af(x,y):
'''
Stockx cobra al vendedor una serie de fees una vez aceptada la transacción,
suele rondar los 9,5% máximo sobre el precio de venta y un 3% por la verificación
de autenticidad y costes de envío.
'''
beneficio = x - y
return round((beneficio * .875),2)
# Agregamos ahora al df una nueva columna con el beneficio bruto de las ventas.
df['Profit'] = df.apply(lambda x: Beneficio_af(x['Sale Price'],x['Retail Price']), axis=1)
# Obtenemos las estadísticas de los valores numéricos.
round(df.describe(),1).T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Sale Price | 99956.0 | 446.6 | 256.0 | 186.0 | 275.0 | 370.0 | 540.0 | 4050.0 |
| Retail Price | 99956.0 | 208.6 | 25.2 | 130.0 | 220.0 | 220.0 | 220.0 | 250.0 |
| Shoe Size | 99956.0 | 9.3 | 2.3 | 3.5 | 8.0 | 9.5 | 11.0 | 17.0 |
| Profit | 99956.0 | 208.3 | 232.9 | -29.8 | 50.8 | 134.8 | 299.2 | 3377.5 |
# Realizamos unos conteos dentro de nuestro df para tener una primera vista de los datos.
tallas = df['Shoe Size'].value_counts(ascending=True)
tipos_por_marca =df['Sneaker Name'].value_counts(ascending=True)
zona = df['Buyer Region'].value_counts()
tallas.tail(10).plot(kind='barh', fontsize=10,color='m')
plt.title('Tallas de pie mas demandadas',fontsize=15,color='b')
Text(0.5, 1.0, 'Tallas de pie mas demandadas')
tipos_por_marca.tail(15).plot(kind='barh', fontsize=10, color='r')
plt.title('Modelos de Sneaker mas vendidos', fontsize=15,color='b')
Text(0.5, 1.0, 'Modelos de Sneaker mas vendidos')
marca = df['Brand'].value_counts()
total = df['Brand'].count()
prc = round(100*marca/total,2)
print (f'Las Yeezy representan el',prc[0],'% y las Off White el ',prc[1],'% de las ventas')
Las Yeezy representan el 72.19 % y las Off White el 27.81 % de las ventas
df.groupby(['Brand']).sum()
| Sale Price | Retail Price | Shoe Size | Profit | |
|---|---|---|---|---|
| Brand | ||||
| Yeezy | 25980669 | 15866540 | 666656.5 | 8849860.24 |
| Off-White | 18663151 | 4985640 | 267350.5 | 11967822.80 |
precio_de_compra_medio = df['Retail Price'].mean()
print('El precio de compra medio es: ',round(precio_de_compra_medio,2))
El precio de compra medio es: 208.61
precio_de_venta_medio = df['Sale Price'].mean()
print('El precio de venta medio es: ',round(precio_de_venta_medio,2))
El precio de venta medio es: 446.63
# Representamos graficamente las ventas
df_cat = ['Buyer Region', 'Sneaker Name', 'Retail Price', 'Shoe Size']
for cat in df_cat:
cat_num = df[str(cat)].value_counts()
plt.figure(figsize=(15,6))
chart = sns.barplot(x = cat_num.index, y= cat_num)
chart.set_title("Sneakers Sales by %s" % (cat))
plt.ylabel("Sneaker Sales")
plt.xlabel(cat)
chart.set_xticklabels(chart.get_xticklabels(), rotation = 90)
plt.gca().xaxis.set_major_locator(plt.MultipleLocator(1))
# Usamos seaborn y pintamos un boxplot para ver los outliers
plt.figure(figsize=(15,10))
sns.boxplot(x='Retail Price', y='Sale Price', data=df)
<AxesSubplot:xlabel='Retail Price', ylabel='Sale Price'>
import cufflinks as cf
from ipywidgets import interact
import plotly
plotly.offline.init_notebook_mode()
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
from plotly import tools
import matplotlib.pyplot as plt
# Hacemos un filtro del df y luego lo agrupamos por zona y sumamos las ventas totales.
bds = df[['Brand', 'Sale Price','Order Date','Buyer Region']]
bds.index = bds['Order Date']
bds.drop('Order Date',axis=1)
bds2 = bds.groupby(['Buyer Region']).sum()['Sale Price'].sort_values(ascending=False)
bds2 = pd.DataFrame(bds2)
# representación interactiva de la suma de ventas totales.
trace1 = go.Bar(
x = bds2.index,
y = bds2['Sale Price'],
name = "Ventas",
marker = dict(color = 'rgba(255, 174, 255, 0.5)',
line=dict(color='rgb(0,0,0)',width=1.5)),
# text = df_2014.State)
)
layout = {
'xaxis': {'title': 'Ventas en millones de dolares'},
'title': 'Ventas Acumuladas por Estado'
}
fig = go.Figure(data = trace1, layout=layout)
iplot(fig)
# Representamos el beneficio mayor por ciudades
mm = df.sort_values('Profit',ascending=False).head(15)
mm= mm.groupby('Buyer Region')['Profit'].sum().sort_values(ascending=False)
mm.plot(kind='line',legend=True,figsize=(10,10), color='m')
plt.title('Beneficio de ventas por Ciudades')
Text(0.5, 1.0, 'Beneficio de ventas por Ciudades')
# Mostramos que marca genera mas beneficios a la hora de la reventa
df.groupby(['Brand']).sum().plot(kind='pie',y='Profit', autopct='%1.0f%%',
colors = ['y', 'pink'],
title='Team')
<AxesSubplot:title={'center':'Team'}, ylabel='Profit'>
# Realizamos una muestra por separado de cada marca.
off_white = df.copy()
off_white = off_white[off_white['Brand'].isin(['Off-White'])]
yeezy = df.copy()
yeezy = yeezy[yeezy['Brand'].isin([' Yeezy'])]
print('Shape of Off-White dataframe : {}'.format(off_white.shape))
print('Shape of Yeezy dataframe : {}'.format(yeezy.shape))
Shape of Off-White dataframe : (27794, 9) Shape of Yeezy dataframe : (72162, 9)
# Creamos la función para calcular el beneficio neto.
def comision(x,y):
'''
Stockx cobra al vendedor una serie de fees una vez aceptada la transacción,
suele rondar los 9,5% máximo sobre el precio de venta y un 3% por la verificación
de autenticidad y costes de envío.
'''
comision = x - y
return round((comision * (1-.875)),2)
yeezy['Fees_Y'] = yeezy.apply(lambda x: comision(x['Sale Price'],x['Retail Price']), axis=1)
yeezy.head(10).sort_values('Fees_Y',ascending=False)
| Order Date | Brand | Sneaker Name | Sale Price | Retail Price | Release Date | Shoe Size | Buyer Region | Profit | Fees_Y | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 Low V2 Beluga | 1097 | 220 | 2016-09-24 | 11.0 | California | 767.38 | 109.62 |
| 3 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Core Black Red | 1075 | 220 | 2016-11-23 | 11.5 | Kentucky | 748.12 | 106.88 |
| 4 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Core Black Red 2017 | 828 | 220 | 2017-11-02 | 11.0 | Rhode Island | 532.00 | 76.00 |
| 5 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Core Black Red 2017 | 798 | 220 | 2017-11-02 | 8.5 | Michigan | 505.75 | 72.25 |
| 6 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Core Black White | 784 | 220 | 2016-12-17 | 11.0 | California | 493.50 | 70.50 |
| 2 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Core Black Green | 690 | 220 | 2016-11-23 | 11.0 | California | 411.25 | 58.75 |
| 1 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Core Black Copper | 685 | 220 | 2016-11-23 | 11.0 | California | 406.88 | 58.12 |
| 8 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Cream White | 465 | 220 | 2017-04-29 | 11.0 | Kansas | 214.38 | 30.62 |
| 9 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Cream White | 465 | 220 | 2017-04-29 | 11.0 | Florida | 214.38 | 30.62 |
| 7 | 2017-01-09 | Yeezy | Adidas Yeezy Boost 350 V2 Cream White | 460 | 220 | 2017-04-29 | 10.0 | New York | 210.00 | 30.00 |
yeezy.mean().round(2)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version. /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
Sale Price 360.03 Retail Price 219.87 Shoe Size 9.24 Profit 122.64 Fees_Y 17.52 dtype: float64
off_white.head(10).sort_values('Sale Price',ascending=False)
plt.figure(figsize=(17,5))
plt.xlabel('Sneaker Name')
#chart.set_xticklabels(chart.get_xticklabels(), rotation = 180)
sns.countplot(off_white['Sneaker Name'].head(200));
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
off_white['Fees_OF'] = off_white.apply(lambda x: comision(x['Sale Price'],x['Retail Price']), axis=1)
off_white.head(10).sort_values('Fees_OF',ascending=False)
| Order Date | Brand | Sneaker Name | Sale Price | Retail Price | Release Date | Shoe Size | Buyer Region | Profit | Fees_OF | |
|---|---|---|---|---|---|---|---|---|---|---|
| 149 | 2017-08-09 | Off-White | Air Jordan 1 Retro High Off White Chicago | 2450 | 190 | 2017-09-09 | 13.0 | New York | 1977.50 | 282.50 |
| 153 | 2017-08-09 | Off-White | Nike Air VaporMax Off White | 2399 | 250 | 2017-09-09 | 11.5 | Kentucky | 1880.38 | 268.62 |
| 150 | 2017-08-09 | Off-White | Air Jordan 1 Retro High Off White Chicago | 2325 | 190 | 2017-09-09 | 12.0 | Illinois | 1868.12 | 266.88 |
| 132 | 2017-07-09 | Off-White | Nike Air VaporMax Off White | 1800 | 250 | 2017-09-09 | 12.0 | Kentucky | 1356.25 | 193.75 |
| 128 | 2017-07-09 | Off-White | Nike Air Max 90 Off White | 1600 | 160 | 2017-09-09 | 8.0 | California | 1260.00 | 180.00 |
| 130 | 2017-07-09 | Off-White | Nike Air Presto Off White | 1344 | 160 | 2017-09-09 | 10.0 | New York | 1036.00 | 148.00 |
| 131 | 2017-07-09 | Off-White | Nike Air Presto Off White | 1325 | 160 | 2017-09-09 | 10.0 | Massachusetts | 1019.38 | 145.62 |
| 152 | 2017-08-09 | Off-White | Nike Air Presto Off White | 1299 | 160 | 2017-09-09 | 11.0 | Colorado | 996.62 | 142.38 |
| 129 | 2017-07-09 | Off-White | Nike Air Max 90 Off White | 1090 | 160 | 2017-09-09 | 11.5 | New York | 813.75 | 116.25 |
| 151 | 2017-08-09 | Off-White | Nike Air Max 90 Off White | 850 | 160 | 2017-09-09 | 11.0 | New Jersey | 603.75 | 86.25 |
off_white.mean().round(2)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version. /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
Sale Price 671.48 Retail Price 179.38 Shoe Size 9.62 Profit 430.59 Fees_OF 61.51 dtype: float64
plt.figure(figsize=(12,8))
sns.scatterplot(x=df['Retail Price'],
y=df['Profit'],
hue = df['Brand'],
s=100);
import folium
from folium import plugins
import ipywidgets
import geocoder
import geopy
from vega_datasets import data as vds
from branca.element import Figure
# Sacamos una lista con las ciudades
df_map = df['Buyer Region'].drop_duplicates()
df_map
pd.Series(df_map)
citys = list(df_map)
['California',
'Kentucky',
'Rhode Island',
'Michigan',
'New York',
'Kansas',
'Florida',
'New Jersey',
'Texas',
'North Carolina',
'Oregon',
'Alabama',
'Delaware',
'Virginia',
'Wisconsin',
'Colorado',
'Massachusetts',
'Pennsylvania',
'Louisiana',
'Washington',
'Georgia',
'Ohio',
'Nebraska',
'Oklahoma',
'Connecticut',
'Missouri',
'South Carolina',
'Maine',
'Illinois',
'Nevada',
'Maryland',
'Arizona',
'Minnesota',
'Iowa',
'Tennessee',
'West Virginia',
'Indiana',
'Arkansas',
'Alaska',
'Wyoming',
'Utah',
'New Hampshire',
'Vermont',
'District of Columbia',
'Hawaii',
'New Mexico',
'South Dakota',
'Mississippi',
'North Dakota',
'Idaho',
'Montana']
map_geojson = folium.Map(location=[41, -99], zoom_start=4)
folium.GeoJson('mapa_eda.geojson', name='geojson').add_to(map_geojson)
folium.LayerControl().add_to(map_geojson)
map_geojson